import altair as alt
import pandas as pd
import numpy as np
import io
from google.colab import drive
# Mount Google Drive so the CSV inputs under "My Drive/Data Viz" can be read.
drive.mount('/content/drive')

# The 'date' and 'listing_id' columns of the reviews table are used below.
review_by_date = pd.read_csv('/content/drive/My Drive/Data Viz/reviews.csv')

# Parse the review dates once, then derive calendar year and month from them.
review_dates = pd.DatetimeIndex(review_by_date['date'])
review_by_date['year'] = review_dates.year
review_by_date['month'] = review_dates.month

# Count the distinct listings reviewed in each year. The column is named
# "num_reviews" but holds the number of unique listing_id values.
review_by_year = (
    review_by_date.groupby('year')['listing_id']
    .nunique()
    .reset_index(name='num_reviews')
)
# Title/subtitle configuration for the yearly bar chart.
review_title = {
    'text': ['Number of Airbnb Listings With Reviews in Chicago Over Time'],
    "fontSize": 22,
    'subtitle': [
        'The number of Airbnb listings in Chicago has grown drastically in the last decade.',
        'The growth was concentrated in the years from 2015-2019 where listing volume grew almost 7 times.',
    ],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": "left",
    "anchor": "start",
    "offset": 30,
}

# One bar per year (ordinal x axis); bar height is the per-year count
# stored in the num_reviews column.
review_barplot = (
    alt.Chart(review_by_year)
    .mark_bar(size=30)
    .encode(
        x=alt.X('year:O', axis=alt.Axis(title='Year')),
        y=alt.Y('num_reviews:Q', axis=alt.Axis(title='Number of listings with reviews')),
    )
    .properties(height=400, width=600, title=review_title)
    .configure(background='#f0ead6')
    .configure_mark(color='#008080')
    .configure_axis(labelFontSize=12, titleFontSize=15)
)
review_barplot
Data Source: Inside Airbnb
*Note: The number of listings with reviews is used as a proxy for the number of listings, because time-series listing data are not available.
The city of Chicago attracts more than 50 million tourists annually. Tourism demand explains the growth in Chicago of Airbnb, the San Francisco-based online short-term rental platform. In only a decade, Airbnb has expanded tremendously, leading to additional rental revenues as well as increased competition with local hotels. Using data from Inside Airbnb, a third-party non-commercial database constructed through periodic scraping, we see that the majority of listing growth on Airbnb in Chicago happened in the last few years and the momentum does not show any signs of stopping. Given this growth, our policy question becomes ever more important because, as the number of Airbnb listings increases, its impact on local economic development will become stronger.
# Load the detailed listings table from Drive. Later cells select columns from
# it (e.g. id, room_type, latitude, longitude, price, host_id).
listings = pd.read_csv('/content/drive/My Drive/Data Viz/listings_detailed.csv')
!pip install sodapy
from sodapy import Socrata
!pip install geopandas
from shapely.geometry import Polygon
import geopandas as gpd
def get_ca_spatial_data(key):
    '''
    Download tract spatial data from the Chicago Open Data Portal
    (data.cityofchicago.org) through the Socrata API.

    Up to 100000 records of dataset "74p9-q2aq" are requested and returned
    as a pandas DataFrame built from the raw records.

    NOTE(review): ``key`` is accepted but never used; the dataset id is
    hard-coded below. Confirm which dataset is intended before wiring the
    argument through, since callers pass a different id.
    '''
    portal = Socrata("data.cityofchicago.org", None)
    records = portal.get("74p9-q2aq", limit=100000)
    return pd.DataFrame.from_records(records)
def get_polygon(row):
    '''
    Build a Polygon object from one record's coordinate list.

    Only the entry at ``row['the_geom']['coordinates'][0][0]`` is used; any
    further entries under ``coordinates`` are ignored.
    '''
    ring = row['the_geom']['coordinates'][0][0]
    return Polygon(ring)
# Geodataframe for the base map: download the boundary records, attach a
# Polygon per row, and keep only the area id and its geometry.
ca_df = get_ca_spatial_data('igwz-8jzy')
ca_df['geometry'] = ca_df.apply(get_polygon, axis=1)
ca_geodf = gpd.GeoDataFrame(ca_df)[['commarea', 'geometry']]

# Dataframe for the point layer: one row per listing with its room type and
# coordinates.
listing_point_columns = ['id', 'room_type', 'latitude', 'longitude']
lst_df = listings[listing_point_columns]
# Inspired by this medium post https://medium.com/dataexplorations/creating-choropleth-maps-in-altair-eeb7085779a1
map_title = {
    'text': ['Visualizing Airbnb Listings by Room Type in Chicago Neighborhoods'],
    'subtitle': [
        'The majority of Airbnb listings concentrate on the North Side of Chicago especially in areas along',
        'the CTA Blue and Red Lines. Northern listings are dominated by entire home or apartment rentals.',
        'On the South Side, most Airbnb listings are in Eastern neighborhoods near the lake',
        'and there is a balanced mix between private room and entire home/apartment rentals.',
    ],
    'fontSize': 22,
    'subtitleColor': 'gray',
    'subtitleFontSize': 18,
    'align': 'left',
    'anchor': 'start',
    'offset': 25,
}

# Base layer: boundary outlines only (gray stroke, no fill).
base = (
    alt.Chart(ca_geodf)
    .mark_geoshape(stroke='gray', strokeWidth=1, fill=None)
    .encode()
    .properties(width=800, height=800, title=map_title)
)

# Fixed colour per room type for the point layer.
map_room_type_scale = alt.Scale(
    domain=['Entire home/apt', 'Hotel room', 'Private room', 'Shared room'],
    range=['#7fc97f', 'yellow', '#e7298a', 'blue'],
)

# Point layer: one small filled dot per listing, coloured by room type.
pts = (
    alt.Chart(lst_df)
    .mark_point(size=8, filled=True)
    .encode(
        latitude='latitude',
        longitude='longitude',
        color=alt.Color(
            "room_type",
            legend=alt.Legend(title="Airbnb rental type by color"),
            scale=map_room_type_scale,
        ),
        opacity=alt.value(0.7),
    )
)

# Draw the points on top of the outlines.
alt.layer(base + pts).configure(background='#f0ead6')
Data Source: Chicago Open Data Portal (for geoshape file for community areas) and Inside Airbnb for Airbnb listing data
The visualization of Airbnb listings on a map of Chicago makes it more evident that Airbnb disproportionately benefits more affluent neighborhoods on the North Side of the city. Airbnb listings on the North Side concentrate in areas near the Loop, along the lake and along the CTA Blue Line, where many gentrified neighborhoods such as Bucktown, Wicker Park and Logan Square are located. Along with the scatterplot above, this map provides more concrete evidence that the benefit of Airbnb skews towards richer neighborhoods.
# Data processing: per-capacity median price and listing count.
capacity = listings[['id', 'accommodates', 'price']]
grouped_by_capacity = capacity.groupby('accommodates')

# Median price for each "accommodates" value.
capacity_price = grouped_by_capacity.agg({'price': ['median']}).reset_index()
capacity_price.columns = ['capacity', 'median_price']

# Number of listings for each "accommodates" value.
capacity_num_listings = grouped_by_capacity.agg({'id': ['count']}).reset_index()
capacity_num_listings.columns = ['capacity', 'num_listings']

# Join both summaries and keep capacities of at most 16 people.
capacity_price_num_listings = capacity_num_listings.merge(capacity_price, on='capacity')
at_most_16 = capacity_price_num_listings['capacity'] <= 16
capacity_price_num_listings = capacity_price_num_listings[at_most_16]
# Title/subtitle configuration for the bubble chart. The subtitle text is
# rendered on the chart, so its spelling is corrected here
# ("positively", "accommodate").
capacity_title = {
    "text": ['Airbnb Median Price by Listing Capacity'],
    "fontSize": 22,
    "subtitle": [
        'Listing median price positively correlates with capacity for listings that accommodate 1 to 13 people.',
        'After this mark, the correlation seems to break down.',
    ],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": "left",
    "anchor": "start",
    "offset": 25,
}

# One circle per capacity value: x is the median price, y is the capacity,
# and the circle area scales with the number of listings at that capacity.
capacity_chart = (
    alt.Chart(capacity_price_num_listings)
    .mark_circle()
    .encode(
        x=alt.X('median_price:Q', axis=alt.Axis(title="Median Price")),
        y=alt.Y("capacity:Q", axis=alt.Axis(title="Number of People The Listing Can Accommodate")),
        size=alt.Size(
            'num_listings',
            legend=alt.Legend(title="Number of listings"),
            scale=alt.Scale(range=[2, 3000]),
        ),
    )
    .properties(height=600, width=600, title=capacity_title)
    .configure(background='#f0ead6')
    .configure_mark(color='purple')
    .configure_axis(labelFontSize=12, titleFontSize=15)
)
capacity_chart
Data Source: Inside Airbnb
Before looking at the data, our hypothesis was that there would be a positive correlation between listing capacity and median price, and that the correlation would show diminishing returns. The data corroborate part of that hypothesis. As listing capacity increases from 1 to 13 people, we see a positive correlation between capacity and price. However, as listing capacity reaches beyond 13 people, this correlation breaks down. This could partly be because of the lower number of listings with high capacity.
# Cancellation policy vs price histogram
# Keep every listing except those under the two "super strict" policies.
cancellation_policy = listings[['cancellation_policy', 'price']]
excluded_policies = ['super_strict_30', 'super_strict_60']
cancellation_policy_no_strict = cancellation_policy[
    ~cancellation_policy['cancellation_policy'].isin(excluded_policies)
]

hist_title = {
    "text": ["Airbnb Listings Price Histogram By Cancellation Policy"],
    "fontSize": 22,
    "subtitle": ["The histograms below show the distribution of price per night for Airbnb listings in Chicago. Different cancellation policies were marked by different colors."],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": "left",
    "anchor": "start",
    "offset": 25,
}

# One facet per cancellation policy; prices are binned in steps of 20 over
# the 0-1000 range and the bars count listings per bin.
price_hist_by_cancellation = (
    alt.Chart(cancellation_policy_no_strict)
    .mark_bar(opacity=0.8, binSpacing=0)
    .encode(
        alt.X(
            "price:Q",
            bin=alt.Bin(extent=[0, 1000], step=20),
            axis=alt.Axis(title='Price Per Night (US$)'),
        ),
        alt.Y(
            'count()',
            stack=None,
            axis=alt.Axis(title='Number of Listings'),
        ),
        alt.Color('cancellation_policy:N'),
        facet='cancellation_policy:N',
    )
    .properties(title=hist_title)
    .configure(background='#f0ead6')
    .configure_axis(labelFontSize=12, titleFontSize=15, grid=False)
)
price_hist_by_cancellation
# Data processing: how many listings each host owns.
host = (
    listings[['id', 'host_id']]
    .groupby('host_id')['id']
    .count()
    .reset_index(name='num_listings')
)

# Bucket hosts by portfolio size: exactly 1, 2 to 5, or more than 5 listings.
ownership_bins = [0, 1, 5, float('Inf')]
ownership_labels = ['1 listing', '2-5 listings', 'more than 5 listings']
host['listing_range'] = pd.cut(host['num_listings'], bins=ownership_bins, labels=ownership_labels)

# Number of hosts in each bucket.
host_by_listing_range = host.groupby('listing_range').agg({'host_id': ['count']}).reset_index()

# Attach each listing's host bucket to its room/property type.
room_type_by_host = listings[['host_id', 'id', 'room_type', 'property_type']].merge(
    host, on='host_id'
)
alt.data_transformers.disable_max_rows() #disabling max rows to handle big dataset (>5000 rows)

# Title/subtitle configuration for the stacked bar chart. The subtitle is
# rendered on the chart, so the "Airbnbn" misspelling is corrected here.
ownership_title = {
    "text": ["Airbnb Listings By Host Ownership and Room Type"],
    "fontSize": 22,
    "subtitle": ["The majority of Airbnb listings in Chicago are entire home/apartment regardless of the number of listings the host owns"],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": "left",
    "anchor": "start",
    "offset": 25,
}

# Horizontal stacked bars: one bar per host-ownership bucket, segmented by
# room type. The title is set once via .properties(); the former title=
# argument to alt.Chart was overwritten by it and has been dropped.
room_type_stacked_bar = (
    alt.Chart(room_type_by_host)
    .mark_bar(size=30)
    .encode(
        alt.Y(
            'listing_range',
            type='nominal',
            sort=['1 listing', '2-5 listings', 'more than 5 listings'],
            axis=alt.Axis(title='Host Ownership Range'),
        ),
        alt.X(
            'count()',
            type='quantitative',
            axis=alt.Axis(title='Number of Listings'),
        ),
        color=alt.Color(
            "room_type",
            legend=alt.Legend(title="Airbnb rental type by color"),
            scale=alt.Scale(
                domain=['Entire home/apt', 'Hotel room', 'Private room', 'Shared room'],
                range=['#7fc97f', 'yellow', '#e7298a', 'blue'],
            ),
        ),
        # Stack the segments in ascending room_type order.
        order=alt.Order('room_type', sort='ascending'),
    )
    .properties(height=250, width=600, title=ownership_title)
    .configure(background='#f0ead6')
    .configure_mark(color='#008080')
    .configure_axis(labelFontSize=12, titleFontSize=15)
)
room_type_stacked_bar
Data Source: Inside Airbnb
In this graph, I want to explore whether Airbnb listings in Chicago are actually short-term rentals as the company advertises. If a person owns multiple listings, or rents out an entire home or apartment for an extended period, we might have reason to believe that this person is using Airbnb for business purposes. According to data from Inside Airbnb as of November 2019, more than 50% of Airbnb listings for Chicago are owned by a host that has multiple listings. The majority of listings in Chicago are entire homes or apartments. This evidence gives credibility to the claim that Airbnb poses some threat to local hotels, as Chicago Airbnb hosts show signs that they are operating Airbnb as a business.
Data Source: Chicago Open Data Portal - Socioeconomic data (2008-2012) and Inside Airbnb
In an article from the Chicago Tribune, the reporters argued, based on a study by Purdue University, that Airbnb disproportionately benefited white neighborhoods in major cities. I used data from the Chicago Open Data Portal and Inside Airbnb to validate this argument. Evidence shows that there is a negative correlation between median listing price on Airbnb and poverty rate, suggesting that residents from more well-off neighborhoods can benefit more from renting out a spare room in their apartments or houses.
# Review/response columns used by the rating scatter plots.
# .copy() makes this an independent frame, so the column assignment below
# does not write to a slice of `listings` (avoids SettingWithCopyWarning).
rr_and_review = listings[
    ['number_of_reviews', 'review_scores_rating', 'review_scores_location', 'host_response_rate', 'price']
].copy()

# Strip the trailing "%" from host_response_rate and convert the column to float.
rr_and_review['host_response_rate'] = (
    rr_and_review['host_response_rate'].str.rstrip('%').astype('float')
)
# A single selection object is attached to all three panels.
selection = alt.selection_single()


def _reviews_scatter(x_field, x_title):
    """Scatter number_of_reviews (y) against one quantitative column of rr_and_review (x).

    Points matching the selection are drawn at 0.8 opacity, all others at 0.1.
    """
    return (
        alt.Chart(rr_and_review)
        .mark_point(size=20, color='purple')
        .add_selection(selection)
        .encode(
            x=alt.X(x_field + ':Q', title=x_title),
            y=alt.Y('number_of_reviews:Q', title='Number of Reviews'),
            opacity=alt.condition(selection, alt.value(0.8), alt.value(0.1)),
        )
        .properties(height=400, width=400)
    )


overall_and_review_chart = _reviews_scatter('review_scores_rating', 'Overall Rating')
loc_and_review_chart = _reviews_scatter('review_scores_location', 'Location Review Score')
rr_and_review_chart = _reviews_scatter('host_response_rate', 'Host Response Rate')

ratings_title = {
    "text": ['Airbnb Popularity And Different Rating Measures'],
    "fontSize": 22,
    "subtitle": [
        "In Chicago, overall ratings fall above 80% and location ratings fall above 8/10.",
        "Host Response Rate varies for listings with fewer reviews. Listings with more than 200 reviews consistently have above 90% response rate.",
    ],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": "left",
    "anchor": "start",
    "offset": 20,
}

# Place the three panels side by side under one shared title.
overall_chart = (
    alt.hconcat(overall_and_review_chart, loc_and_review_chart, rr_and_review_chart)
    .configure(background='#f0ead6')
    .properties(title=ratings_title)
    .configure_axis(labelFontSize=12, titleFontSize=15)
)
overall_chart
Data Source: Inside Airbnb
In this part of the visualization, we explore the correlation between Airbnb listings' popularity, proxied by the total number of reviews, and different measurements of ratings. First of all, the data show that both overall rating and location review score correlate positively with the number of reviews. Host response rate data have a wider range than rating scores. For example, the response rate ranges from 0% to 100%, though the majority of hosts have a response rate higher than 80%. For listings with more than 300 reviews, the response rate is consistently above 90%.
# Processing data: join census indicators with the median listing price of
# each neighbourhood.
census_data = pd.read_csv('/content/drive/My Drive/Data Viz/census_data_0812.csv')

# Median listing price per neighbourhood_cleansed value.
median_price = (
    listings[['id', 'neighbourhood_cleansed', 'price']]
    .groupby('neighbourhood_cleansed')
    .agg({'price': ['median']})
    .reset_index()
)
median_price.columns = ['Neighborhood', 'Median Listing Price']

# census_data is rebound to the merged table; rows are matched on the
# community area name.
census_data = census_data.merge(
    right=median_price,
    left_on='COMMUNITY AREA NAME',
    right_on='Neighborhood',
)

scatter_columns = ['Neighborhood', 'PERCENT HOUSEHOLDS BELOW POVERTY', 'Median Listing Price']
poverty_data_for_scatterplot = census_data[scatter_columns]
poverty_title = {
    "text": ['Airbnb Median Listing Price at Different Poverty Levels', 'in Chicago Neighborhoods'],
    "fontSize": 22,
    "subtitle": ["In Chicago, neighborhoods with lower poverty rates tend to have higher", "median listing prices on Airbnb"],
    "subtitleColor": "gray",
    "subtitleFontSize": 18,
    "align": 'left',
    "anchor": "start",
    "offset": 25,
}

# One point per neighbourhood: poverty share on x, median listing price on y;
# hovering shows the neighbourhood name.
poverty_scatterplot = (
    alt.Chart(poverty_data_for_scatterplot)
    .mark_point(size=20, color='purple')
    .encode(
        x=alt.X(
            'PERCENT HOUSEHOLDS BELOW POVERTY',
            type='quantitative',
            title='Percent of households below the poverty line (%)',
        ),
        y=alt.Y(
            'Median Listing Price',
            type='quantitative',
            title='Median Listing Price ($)',
        ),
        tooltip=['Neighborhood'],
    )
    .configure(background='#f0ead6')
    .properties(height=500, width=500, title=poverty_title)
    .configure_axis(labelFontSize=12, titleFontSize=15)
)
poverty_scatterplot
# Processing the data: flag "ghost hotel" listings and compute their share
# per neighbourhood, then attach census indicators and boundary geometry.

# .copy() makes this an independent frame, so the new columns added below do
# not write to a slice of `listings` (avoids SettingWithCopyWarning).
ghost_hotels = listings[
    ['id', 'neighbourhood_cleansed', 'zipcode', 'longitude', 'latitude', 'room_type', 'price', 'availability_365']
].copy()
ghost_hotels['annual_availability_pct'] = ghost_hotels['availability_365'] / 365

# A listing is flagged (1) when it is an entire home/apt and its
# availability_365 covers at least half of the year; otherwise 0.
ghost_hotels['is_ghost_hotel'] = np.where(
    (ghost_hotels['room_type'] == 'Entire home/apt') & (ghost_hotels['annual_availability_pct'] >= 0.5),
    1,
    0,
)

# Flagged listings per neighbourhood.
gh_by_neighborhood = ghost_hotels.groupby('neighbourhood_cleansed').agg({'is_ghost_hotel': ['sum']}).reset_index()
gh_by_neighborhood.columns = ['neighborhood', 'num_ghost_hotels']

# All listings per neighbourhood.
listings_by_neighborhood = ghost_hotels.groupby('neighbourhood_cleansed').agg({'id': ['count']}).reset_index()
listings_by_neighborhood.columns = ['neighborhood', 'total_listings']

# Share of flagged listings, in percent.
by_neighborhood = gh_by_neighborhood.merge(right=listings_by_neighborhood, on='neighborhood')
by_neighborhood['pct_ghost_hotel_listing'] = (
    by_neighborhood['num_ghost_hotels'] / by_neighborhood['total_listings'] * 100
)

# Attach census indicators, matched on the community area name.
by_neighborhood = by_neighborhood.merge(right=census_data, right_on='COMMUNITY AREA NAME', left_on='neighborhood')

# Copy the column subset before renaming/assigning so the dtype conversion
# below operates on an independent frame rather than on a slice.
by_neighborhood = by_neighborhood[
    ['neighborhood', 'pct_ghost_hotel_listing', 'Community Area Number', 'PERCENT OF HOUSING CROWDED']
].copy()
by_neighborhood.columns = ['neighborhood', 'pct_ghost_hotel_listing', 'commarea_number', 'pct_housing_crowded']

# Convert the area number to a string (via int) so it can be merged against
# ca_df's 'commarea' column below.
by_neighborhood['commarea_number'] = by_neighborhood['commarea_number'].astype('int').astype('str')

# Attach boundary geometry and wrap the result as a GeoDataFrame.
ca_df_short = ca_df[['the_geom', 'commarea', 'geometry']]
by_neighborhood_df = by_neighborhood.merge(right=ca_df_short, left_on='commarea_number', right_on='commarea')
by_neighborhood_geodf = gpd.GeoDataFrame(by_neighborhood_df)
import json
# Serialise the GeoDataFrame to GeoJSON and hand its features to Altair as
# inline data; per-area attributes are then addressed as "properties.<name>".
by_neighborhood_json = json.loads(by_neighborhood_geodf.to_json())
by_neighborhood_data = alt.Data(values=by_neighborhood_json['features'])

choro_title = {
    'text': ['Distribution of Airbnb "Ghost Hotels" and', 'Percentage of Housing Crowded in Chicago Neighborhoods'],
    'subtitle': [
        'A "ghost hotel" is an Airbnb listing that is an entire home/apartment and rented out for more than half of the year.',
        'Each ghost hotel is represented by a purple dot on the map of Chicago. Most ghost hotels are located in the North',
        'side of the city. Neighborhoods that suffer from crowded housing do not seem to have many "ghost hotel" listings.',
    ],
    'fontSize': 25,
    'subtitleColor': 'gray',
    'subtitleFontSize': 15,
    'align': 'left',
    'anchor': 'start',
    'offset': 25,
}

# Choropleth layer: each area is shaded by its crowded-housing percentage on
# a fixed 0-30 colour domain.
socio_choro_map = (
    alt.Chart(by_neighborhood_data)
    .mark_geoshape()
    .encode(
        alt.Color(
            'properties.pct_housing_crowded:Q',
            title='Percent Of Housing Crowded',
            scale=alt.Scale(scheme='goldgreen', domain=[0, 30]),
        )
    )
    .properties(height=600, width=600, title=choro_title)
)

# Point layer: one small purple dot per listing flagged as a ghost hotel.
flagged_only = ghost_hotels['is_ghost_hotel'] == 1
ghost_hotels_map_data = ghost_hotels[flagged_only]
ghost_hotels_map_data = ghost_hotels_map_data[['longitude', 'latitude', 'id']]
gh_pts = (
    alt.Chart(ghost_hotels_map_data)
    .mark_point(size=2, color='#6A0DAD')
    .encode(
        latitude='latitude',
        longitude='longitude',
    )
)

# Draw the points on top of the choropleth.
alt.layer(socio_choro_map + gh_pts).configure(background='#f0ead6')
Data Source: Chicago Open Data Portal (for geoshape file for community areas) and Inside Airbnb for Airbnb listing data
Airbnb listings that are long-term rentals strain the housing stock of a neighborhood because a home or apartment that could be rented out to residents is instead reserved for tourists. The term "ghost hotel" was coined to describe such long-term rentals that act like a hotel but are not registered as such. For the simplicity of this analysis, I define a "ghost hotel" as an Airbnb listing that is rented out in its entirety (entire home or apartment) for more than half of the year. These listings tend to concentrate in neighborhoods on the North Side of the city, close to downtown and public transit and thus convenient for tourists.
Theme:
Title font size: 22
Title font color: black
Subtitle font size: 18
Subtitle font color: gray
Background color: #f0ead6
Mark color for bar chart: teal
Mark color for point (scatterplot): purple